{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def make_columns_lower_case(df):\n",
    "    cols = df.columns\n",
    "    df.columns = map(lambda x: str.lower(x), cols)\n",
    "    return df\n",
    "\n",
    "\n",
    "from datetime import timedelta, date\n",
    "\n",
    "def daterange(start_date, end_date):\n",
    "    for n in range(int((end_date - start_date).days)):\n",
    "        yield start_date + timedelta(n)\n",
    "\n",
    "def get_daily_values(analysis_start_date, analysis_end_date,  d):\n",
    "    rows = []\n",
    "    for single_date in daterange(analysis_start_date, analysis_end_date):\n",
    "        previous_snapshot_date =  d [ d.snapshot_date <= single_date][\"snapshot_date\"].max()\n",
    "        previous_snapshot_row = d[d.snapshot_date == previous_snapshot_date].reset_index(drop=True)\n",
    "        previous_snapshot_row[\"dt\"] = single_date\n",
    "        vacation_row = d.head(1)\n",
    "        vacation_row[\"dt\"] = single_date\n",
    "        if  single_date < d[\"start date\"].min().date():\n",
    "            rows.append(vacation_row)\n",
    "        elif single_date >= date(2020,12,27) and single_date <= date(2021,1,2):\n",
    "            rows.append(vacation_row)\n",
    "        else:\n",
    "            rows.append(previous_snapshot_row)\n",
    "\n",
    "    return pd.concat(rows).reset_index(drop=True)\n",
    "\n",
    "def remap_values(x):\n",
    "    if str.upper(str(x)).replace(\"*\", \"\") in [\"V\", \"T\"]:\n",
    "        return str.upper(str(x)).replace(\"*\", \"\")\n",
    "    else:\n",
    "        return \"H\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "###Directory for the burbio data\n",
    "burbio_data_dir=\"./data/burbio/\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the burbio data, snapshots from August 2020 - January 2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "school_start_dates = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - School District Start Dates.csv\").drop_duplicates([\"COUNTY FIPS\",  \"District ID\"])\n",
    "school_start_dates = make_columns_lower_case(school_start_dates)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "august_10 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 8-10-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "august_10 = make_columns_lower_case(august_10)\n",
    "\n",
    "august_10[\"snapshot_date\"] = \"8/10/2020\"\n",
    "august_10[\"gr 6-8\"] = august_10[\"6-8\"] \n",
    "august_10[\"gr 9-12\"] = august_10[\"9-12\"] \n",
    "\n",
    "\n",
    "august_24 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 8-24-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "august_24 = make_columns_lower_case(august_24)\n",
    "august_24[\"snapshot_date\"] = \"8/24/2020\"\n",
    "august_24[\"gr 6-8\"] = august_24[\"6-8\"] \n",
    "august_24[\"gr 9-12\"] = august_24[\"9-12\"] \n",
    "\n",
    "september_9 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 9-8-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "september_9 = make_columns_lower_case(september_9)\n",
    "september_9[\"snapshot_date\"] = \"9/8/2020\"\n",
    "september_9[\"gr 6-8\"] = september_9[\"6-8\"] \n",
    "september_9[\"gr 9-12\"] = september_9[\"9-12\"] \n",
    "\n",
    "september_16 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 9-16-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "september_16 = make_columns_lower_case(september_16)\n",
    "september_16[\"snapshot_date\"] = \"9/16/2020\"\n",
    "september_16[\"gr 6-8\"] = september_16[\"6-8\"] \n",
    "september_16[\"gr 9-12\"] = september_16[\"9-12\"] \n",
    "\n",
    "october_16 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 10-16-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "october_16 = make_columns_lower_case(october_16)\n",
    "october_16[\"snapshot_date\"] = \"10/16/2020\"\n",
    "october_16[\"gr 6-8\"] = october_16[\"6-8\"] \n",
    "october_16[\"gr 9-12\"] = october_16[\"9-12\"] \n",
    "\n",
    "november_6 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  - 11-6-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "november_6 = make_columns_lower_case(november_6)\n",
    "november_6[\"snapshot_date\"] = \"11/6/2020\"\n",
    "november_6[\"gr 6-8\"] = november_6[\"6-8\"] \n",
    "november_6[\"gr 9-12\"] = november_6[\"9-12\"] \n",
    "\n",
    "december_4 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  -  12-4-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "december_4 = make_columns_lower_case(december_4)\n",
    "december_4[\"snapshot_date\"] = \"12/4/2020\"\n",
    "\n",
    "\n",
    "december_11 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  -  12-11-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "december_11 = make_columns_lower_case(december_11)\n",
    "december_11[\"snapshot_date\"] = \"12/11/2020\"\n",
    "\n",
    "\n",
    "december_18 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  -  12-18-2020.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "december_18 = make_columns_lower_case(december_18)\n",
    "december_18[\"snapshot_date\"] = \"12/18/2020\"\n",
    "\n",
    "january_1 = pd.read_csv(burbio_data_dir + \"My Copy of New** Compiled Burbio Spreadsheet  -  1-1-2021.csv\").drop_duplicates([\"FIPS\",  \"District ID\"])\n",
    "january_1 = make_columns_lower_case(january_1)\n",
    "january_1[\"snapshot_date\"] = \"1/1/2021\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the fips code regin mapping data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "fips_region = pd.read_csv(\"data/Fips Regions Divisions - 210625.csv\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine burbio snapshots into 1 data frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "school_data = pd.concat([august_10, august_24, september_9, september_16, october_16, november_6, december_4, december_11, december_18, january_1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add school start dates "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data  = pd.merge(school_data, school_start_dates[[\"county fips\",  \"district id\", \"start date\"]], right_on = [\"county fips\",  \"district id\"], left_on=[\"fips\", \"district id\"], how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data[\"snapshot_date\"] = pd.to_datetime(all_data.snapshot_date)\n",
    "all_data[\"start date\"] = pd.to_datetime(all_data[\"start date\"])\n",
    "\n",
    "#### Data clean up\n",
    "all_data[\"gr 9-12\"] = all_data[\"gr 9-12\"].apply(remap_values)\n",
    "all_data[\"gr 6-8\"] = all_data[\"gr 6-8\"].apply(remap_values)\n",
    "all_data[\"k-5\"] = all_data[\"k-5\"].apply(remap_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### group by data for each district and fips code (i.e., county)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "distrcit_fips = all_data[[\"district id\",\"fips\", \"k-5\"]].groupby([\"district id\",\"fips\"]).count().sort_values([\"k-5\"]).reset_index()\n",
    "analysis_start_date = pd.to_datetime(\"6/1/2020\").date()\n",
    "analysis_end_date = date(2021, 2, 10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert pandas date coumns from datetime to date type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data[\"start_date\"] = all_data[\"start date\"].dt.date\n",
    "all_data[\"snapshot_date\"] = all_data[\"snapshot_date\"].dt.date"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Populate daily data by filling out the in between snapshots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/hoktay/.pyenv/versions/3.7.3/envs/ertem-paper-environment/lib/python3.7/site-packages/pandas/core/sorting.py:403: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.\n",
      "  indexer = non_nan_idx[non_nans.argsort(kind=kind)]\n",
      "/Users/hoktay/.pyenv/versions/3.7.3/envs/ertem-paper-environment/lib/python3.7/site-packages/pandas/core/ops/array_ops.py:73: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.\n",
      "  result = libops.scalar_compare(x.ravel(), y, op)\n",
      "/Users/hoktay/.pyenv/versions/3.7.3/envs/ertem-paper-environment/lib/python3.7/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._set_item(key, value)\n",
      "/Users/hoktay/.pyenv/versions/3.7.3/envs/ertem-paper-environment/lib/python3.7/site-packages/numpy/core/_methods.py:40: FutureWarning: Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.\n",
      "  return umr_maximum(a, axis, None, out, keepdims, initial, where)\n"
     ]
    }
   ],
   "source": [
    "daily_dfs = []\n",
    "for i, r in distrcit_fips.iterrows():\n",
    "    if r[\"k-5\"] !=10:\n",
    "        continue\n",
    "    d = all_data[(all_data[\"district id\"] == r[\"district id\"]) & (all_data[\"fips\"] == r.fips)][[\"district id\",\"snapshot_date\", \"fips\",  \"start date\", \"gr 6-8\",\"k-5\",\"gr 9-12\"]]\n",
    "    start_row = d.head(1).reset_index(drop = True)\n",
    "    start_row[\"snapshot_date\"] = pd.to_datetime(analysis_start_date)\n",
    "    start_row[\"gr 9-12\"] = \"Vac\"\n",
    "    start_row[\"gr 6-8\"] = \"Vac\"\n",
    "    start_row[\"k-5\"] = \"Vac\"\n",
    "    d = d.append(start_row).sort_values([\"snapshot_date\"]).reset_index(drop=True)\n",
    "    daily_dfs.append(get_daily_values(analysis_start_date, analysis_end_date, d))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine all the data to one dataframe and save as a csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "burbio_daily_data = pd.concat(daily_dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "burbio_daily_data.to_csv(\"./data/burbio/burbio_daily_data.csv\", header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
